.libPaths(c("/home/alt355/R-4.2.1/library", .libPaths()))

# Load necessary libraries
library(Biostrings)
library(stringdist)

# Read in the FASTA file
input_fasta <- readDNAStringSet("../reference/Multi_BAC.fa")  # Replace with your FASTA file path

# Get reverse complements (since these will be extracted from Read1)
rev_complements <- reverseComplement(input_fasta)
sequences.rev <- as.character(rev_complements)

# Check sequences.rev validity
if (!is.character(sequences.rev)) {
  stop("Error: 'sequences.rev' must be a character vector.")
}
if (length(sequences.rev) == 0) {
  stop("Error: No sequences found in the FASTA file.")
}

# Compute minimum Hamming distance
min_dist <- min(stringdistmatrix(sequences.rev, method = "hamming"))
cat("Minimum Hamming distance between sequences:", min_dist, "\n")

# Compute the threshold
threshold <- (min_dist - 1) / 2
cat("Hamming distance threshold:", threshold, "\n")

# Function to generate strings within a Hamming distance threshold
generate_within_hamming <- function(sequence, threshold) {
  if (!is.character(sequence)) {
    stop("Error: 'sequence' must be a character string.")
  }
  
  nucleotides <- c("A", "C", "G", "T")
  seq_length <- nchar(sequence)
  sequence_split <- unlist(strsplit(sequence, ""))
  
  all_mutations <- list(sequence)  # Include the original sequence

  for (k in 1:threshold) {
    # Generate all combinations of `k` positions to mutate
    positions <- combn(seq_length, k, simplify = FALSE)
    
    for (pos_set in positions) {
      temp_sequence <- sequence_split  # Copy the sequence
      for (pos in pos_set) {
        for (nt in nucleotides) {
          if (nt != temp_sequence[pos]) {  # Avoid self-substitution
            temp_sequence[pos] <- nt
            mutated_sequence <- paste(temp_sequence, collapse = "")
            all_mutations <- c(all_mutations, mutated_sequence)
          }
        }
      }
    }
  }
  return(unique(all_mutations))  # Remove duplicates
}

# Apply the function to each sequence
output_list <- list()

for (sequence in sequences.rev) {
  cat("Processing sequence:", sequence, "\n")
  mutations <- generate_within_hamming(sequence, threshold)
  output_list[[sequence]] <- mutations
}

# Output results (e.g., save as an RDS or text file)
saveRDS(output_list, file = "WhitelistOutput.rds")
cat("Whitelist generation completed. Results saved to 'WhitelistOutput.rds'\n")
